# 1) model the ratings 
# 2) relationship between reviews and ratings
# 8) correlation between age and # restaurants/# reviews
# 9) correlation between Chinese population and # Chinese restaurants and # reviews on Chinese restaurants 


library("scales")
library("e1071")
library("ggplot2")
# 1) modeling the ratings 
# Yelp's business proposition is the word of mouth: user-generated reviews and ratings.
# How does rating work? You give a rating (1~5) when you review a restaurant on Yelp.
# Then Yelp presumably collects all the ratings for each restuarant and generate a 1-5 with 
# 0.5 increments. We have no knowledge about how actually Yelp computed these
# displayed ratings, but can we model the ratings?

rm(list = ls())
Yelp <- read.csv("DATA_JLH_CL.csv")

# set up the variables
rating<-Yelp$rating
review<-Yelp$num_reviews


# A basic box plot
qplot(y=rating)+geom_boxplot(fill="yellow",alpha=0.4)+
  ggtitle("Boxplot of sample restaurant ratings in Boston") + 
  ylab("Restaurant Ratings on Yelp (1-5 stars)" ) +
  theme(plot.title = element_text(size=20, face="bold", vjust=2))

plot of chunk unnamed-chunk-1

# there are no ratings less than 3 ?! Are ALL restaurants that good?
# look at a histogram of ratings - BONUS 6 (new plots)
hist.rating<-qplot(rating, geom = "blank") + 
  geom_histogram(aes(y = ..density..),breaks=seq(2.75,5.25,0.5),
                 , alpha = 0.4,colour="black",fill="white") + 
  ggtitle("Distribution of Restaurant Ratings on Yelp") + 
  xlab("Restaurant Ratings on Yelp") + 
  ylab("Density") +
  theme(plot.title = element_text(size=20, face="bold", vjust=2))
hist.rating+ stat_function(fun=dnorm, args=list(mean=mean(rating), sd=sd(rating)),
                           colour="#0072B2") +
  geom_vline(xintercept =quantile(rating,c(0.025,0.975)) , color="red", label="zero")

plot of chunk unnamed-chunk-1

# The distribution of ratings is so centered on the 3.5 to 4.5 and looks a bit normal. 
# Why so? 
# We suspect it is the CLT at work:
# for large enough samples, the distribution of the sum or mean is approxiamtely normal.
# If we assume ratings are rounded averages of all the ratings people give a restuarant, 
# then the more reviews/ratings (larger sample size), 
# the more normal the displayed rating scores (sample means) will be.

# Although we don't know the exact algorithm for computing the ratings, 
# we can verify our hypothesis by segmenting our sample by the number of reviews (n)

# We have three ways to measure how normal the ratings get when the sample size increases,
# or when we get rid of more smaller samples

# Approach 1: skewness and kurtosis study - BONUS 13
# We look at the skewness and kurtosis for different sample sizes

# define our own functions to automate the study - BONUS 11
# input: n - split sample into ratings with more than n and less than n reviews
# output: skewness and kurtosis for each segmented sample
CLT<-function(n) {
  index<-which(review<n)
  rating0<-rating[index]
  rating1<-rating[-index]
  clt<-c(skewness(rating0),skewness(rating1),kurtosis(rating0),kurtosis(rating1))
  clt
}
summary(review) # the maximum review is 2132, the minimum 7 so we can choose n from the range
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     7.0    65.5   138.0   206.0   267.0  2130.0
level<-seq(8,2132,1) # specify the level (n) by which we split the sample
clt<-data.frame(matrix(,ncol=0,nrow=0))
for (n in level) {
  clt<-rbind(clt,c(n,CLT(n)))
  colnames(clt)<-c("n","skewness less than n","skewness more than n",
                   "kurtosis less than n","kurtosis more than n")
}
head(clt)  # a table with skewness and kurtosis
##    n skewness less than n skewness more than n kurtosis less than n
## 1  8                  NaN             -0.02780                  NaN
## 2  9                  NaN             -0.02668                  NaN
## 3 10               0.7500             -0.04087              -1.6875
## 4 11               1.0733             -0.03984              -0.9200
## 5 12               0.0000             -0.03740               0.0625
## 6 13               0.2044             -0.04968               0.6266
##   kurtosis more than n
## 1            -0.012116
## 2            -0.006942
## 3            -0.023232
## 4            -0.018100
## 5            -0.010668
## 6            -0.013253
# Skewness study
#ggplot graph - BONUS 6
ggplot(clt, aes(x=level, y=clt[,2])) +
  geom_point(shape=1,col=rgb(1/2,0,1/4,1/4)) +
  geom_hline(yintercept = 0, color="yellow", label="zero") +
  ggtitle("Skewness values for ratings with more than n reviews") + 
  xlab("n number of reviews") + 
  ylab("Skewness") +
  theme(plot.title = element_text(size=20, face="bold", vjust=2)) 
## Warning: Removed 2 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-1

# this is very convincing, as n increases, the skewness approaches 0

ggplot(clt, aes(x=level, y=clt[,3])) +
  geom_point(shape=1,col=rgb(1,0,1/2,1/4)) +
  geom_hline(yintercept = 0, color="7", label="zero") +
  ggtitle("Skewness values for ratings with less than n reviews") + 
  xlab("n number of reviews") + 
  ylab("Skewness") +
  theme(plot.title = element_text(size=20, face="bold", vjust=2)) 
## Warning: Removed 415 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-1

# also approaches 0, as we get rid of the smaller samples

length(which(abs(clt[,2])<=0.5))/2132 # percentage of skewness within -0.5 and 0.5
## [1] 0.9934
length(which(abs(clt[,3])<=0.5))/2132 # percentage of skewness within -0.5 and 0.5
## [1] 0.8021
# Most of the skewness values are within -0.5 and 0.5, normality is quite a good fit.

#Kurtosis study
ggplot(clt, aes(x=level, y=clt[,4])) +
  geom_point(shape=1,col=rgb(0,0,1/4,1/4)) +
  geom_hline(yintercept = 0, color="black", label="zero") +
  ggtitle("Excess kurtosis for ratings with less than n reviews") + 
  xlab("n number of reviews") + 
  ylab("Excess kurtosis") +
  theme(plot.title = element_text(size=20, face="bold", vjust=2)) 
## Warning: Removed 2 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-1

# excess kurtosis approaches 0, as n increases; so normality is a good fit

# limitations of normality
ggplot(clt, aes(x=level, y=clt[,5])) +
  geom_point(shape=1,col=rgb(1/4,0,1,1/2)) +
  geom_hline(yintercept = 0, color="black", label="zero") +
  ggtitle("Excess kurtosis for ratings with more than n reviews") + 
  xlab("n number of reviews") + 
  ylab("Excess kurtosis") +
  theme(plot.title = element_text(size=20, face="bold", vjust=2)) 
## Warning: Removed 415 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-1

# as we get rid of the smaller samples, kurtosis is close to 0 when n< 500 but
# then strays away far from 0 when n > 500 
length(rating[which(review>500)]) # the observations with more than 500 reviews
## [1] 79
length(rating[which(review>1000)]) # the observations with more than 1000 reviews
## [1] 15
# since the observations in our data get fewer and fewer until exhausted.
# 500 is the limit beyond which the data are too scarce to be showing normality



# Approach 2: Central Limit Theorem (CLT) approximation 
# Since there are more ratings in the middle values, we wonder: 
# What exactly is the probability that a rating score is 3.5 to 4.5? 

# we define a function to automate our study
# input: n - divide the samples into ones with less than n reviews and ones with more than n reviews
# output:  for each divided sample,
# the exact proportion of ratings from 3.5 to 4.5;
# CLT approximation with continuity correction; 
# and error - how much in percentage CLT approximation misses the exact answer
CLTprob<-function(n) {
  index<-which(review<n)
  rating0<-rating[index]
  rating1<-rating[-index]
  exact0<-length(which((rating0<=4.5)&(rating0>=3.5)))/length(rating0)
  prob0<-pnorm(4.75, mean(rating0),sd(rating0))-pnorm(3.25, mean(rating0),sd(rating0)) 
  error0<-abs(prob0-exact0)/exact0
  exact1<-length(which((rating1<=4.5)&(rating1>=3.5)))/length(rating1)
  prob1<-pnorm(4.75, mean(rating1),sd(rating1))-pnorm(3.25, mean(rating1),sd(rating1)) 
  error1<-abs(prob1-exact1)/exact1
  cltprob<-cbind(c(exact0,exact1),c(prob0,prob1),c(error0,error1)*100)
  rownames(cltprob)<-c("more than n","less than n")
  colnames(cltprob)<-c("Exact probability","CLT approximation","Error in %")
  cltprob
}

# the probability of ratings from 3.5 to 4.5 for different levels 
CLTprob(8)[2,] # the original sample
## Exact probability CLT approximation        Error in % 
##            0.9719            0.9587            1.3667
CLTprob(30)
##             Exact probability CLT approximation Error in %
## more than n            0.9412            0.9097      3.345
## less than n            0.9748            0.9631      1.206
CLTprob(70)
##             Exact probability CLT approximation Error in %
## more than n            0.9660            0.9482      1.851
## less than n            0.9741            0.9625      1.192
CLTprob(100)   
##             Exact probability CLT approximation Error in %
## more than n            0.9740            0.9566      1.786
## less than n            0.9707            0.9601      1.089
CLTprob(200)
##             Exact probability CLT approximation Error in %
## more than n            0.9653            0.9493      1.657
## less than n            0.9836            0.9734      1.032
CLTprob(500)
##             Exact probability CLT approximation Error in %
## more than n            0.9707            0.9555    1.55768
## less than n            0.9873            0.9877    0.03835
CLTprob(1000)  
##             Exact probability CLT approximation Error in %
## more than n            0.9715            0.9582      1.377
## less than n            1.0000            0.9783      2.171
plot(1,1, type ="n", xlim = c(5,2100), ylim = c(0,10),
  ylab="Errors in %", main="CLT Approximation Errors in %" ,
  xlab="n, the number of reviews for the restaurants")
legend(1300,6.5, lty=c(1,1),c("Less than n","More than n"), lwd=c(2.5,2.5),col=c("black","red")) 
for (i in 10:2000) {
  points(i,CLTprob(i)[1,3])
  points(i,CLTprob(i)[2,3], col ="red")
} 

plot of chunk unnamed-chunk-1

# We observe from these tables that as the sample sizes increases,
# the errors get smaller, CLT approximations get better;
# if we get rid of the small samples until n=500, the CLT approximations also get better
# again, above 500 reviews the data get so few and the errors stray away.



# Approach 3: Graphical approach - BONUS 6 and 12 (Watch the rainbow curves!)
# We will fit a normal curve to different sample sizes from 30 to 500 reviews

# define functions that generate a accustomed histogram and overlay a normal curve
CLT.hist<-function(n){
  index<-which(review<n)
  rating0<-rating[index]
  hist.rating<-qplot(rating0, geom = "blank") + 
  geom_histogram(aes(y = ..density..),breaks=seq(2.75,5.25,0.5),
                 , alpha = 0.4,colour="black",fill="white") + 
  stat_function(fun=dnorm, args=list(mean=mean(rating0), sd=sd(rating0)),
                  color=sample(20,1)) + 
  ggtitle("Distribution of Restaurant Ratings on Yelp") + 
  xlab("Restaurant Ratings on Yelp") + 
  ylab("Density") +
  theme(plot.title = element_text(size=20, face="bold", vjust=2))
  hist.rating
}

# The normal curves are not a bad fit
CLT.hist(30)# ratings with less than 30 reviews

plot of chunk unnamed-chunk-1

CLT.hist(70)# ratings with less than 70 reviews

plot of chunk unnamed-chunk-1

CLT.hist(100)# ratings with less than 100 reviews

plot of chunk unnamed-chunk-1

CLT.hist(200)# ratings with less than 200 reviews

plot of chunk unnamed-chunk-1

CLT.hist(300) # ratings with less than 300 reviews

plot of chunk unnamed-chunk-1

CLT.hist(500) # ratings with less than 500 reviews

plot of chunk unnamed-chunk-1

CLT.hist(2132) # the entire sample

plot of chunk unnamed-chunk-1

# Also note the shift of distribution from left-skewed to right-skewed:
# the "shoulders" of the histograms move from one way to the other, like breakdancing!


# If we overlay these normal curves, we discover something interesting going on:

hist.rating# set the historgram of the entire sample as the background

plot of chunk unnamed-chunk-1

CLT.curve<-function(g,n,col){
  index<-which(review<n)
  rating0<-rating[index]
  hist.rating<-g+ stat_function(fun=dnorm, args=list(mean=mean(rating0), sd=sd(rating0)),
                  color=col) 
  hist.rating
}

rainbow<-c("Red","Orange","Yellow",'Green','Blue','Violet','magenta')

g1<-CLT.curve(hist.rating,30,rainbow[1]);g1 # ratings with less than 30 reviews

plot of chunk unnamed-chunk-1

g2<-CLT.curve(g1,70,rainbow[2]);g2 # ratings with less than 70 reviews

plot of chunk unnamed-chunk-1

g3<-CLT.curve(g2,100,rainbow[3]);g3 # ratings with less than 100 reviews

plot of chunk unnamed-chunk-1

g4<-CLT.curve(g3,200,rainbow[4]);g4 # ratings with less than 200 reviews

plot of chunk unnamed-chunk-1

g5<-CLT.curve(g4,300,rainbow[5]);g5 # ratings with less than 300 reviews

plot of chunk unnamed-chunk-1

g6<-CLT.curve(g5,500,rainbow[6]);g6 # ratings with less than 500 reviews

plot of chunk unnamed-chunk-1

g<-CLT.curve(g5,2132,rainbow[7]);g # our entire sample

plot of chunk unnamed-chunk-1

# As the sample size increases, the rainbow curves shift to the left! 
# That means, as the number of reviews increase, the ratings tend to decrease!
# To dig into the truth, we will look at the correlation between review and ratings later. 


# With the 3 approaches to model the ratings, we verified our CLT hypothesis: 
# With enough reviews, a normal distribution could have generated the ratings in our sample!
# If Yelp ignored the users' ratings, could they have used this distribution for the ratings?
# We have found a relationship that might have not been statistically significant!
# BONUS 9

# REQUIREMENT 2 - Student t confidence interval:
# What is the average rating of Bostonian restaurants?
# Now that we assume each rating is a sample mean of the ratings for each restaurant, 
# and show that they are quite normally distributed; with a large sample of these means,
# we naturally want a student t confidence interval for the whole population in Boston.
student<-(rating-mean(rating))/(sd(rating)/sqrt(length(rating)))
hist(student,breaks=seq(-150,150,20),freq=FALSE,main="A t distribution?",xlab="rescaled student variable")
curve(dt(x,n-1),col='red',add=TRUE) # does not fit at all

plot of chunk unnamed-chunk-1

# We failed! Why?
# This is because our ratings are discrete values, and are not strictly normal variables,
# so we are not justified to use student t confidence interval 
# but for the sake of comparison later, we can bend the theory: 
classic.t<-t.test(rating,conf.level=.99);classic.t 
## 
##  One Sample t-test
## 
## data:  rating
## t = 342.3, df = 998, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 99 percent confidence interval:
##  3.93 3.99
## sample estimates:
## mean of x 
##      3.96
# a very small interval with so many degrees of freedom!
# we are 99% confident that the true mean ratings is practically 4. Quite high a number!

# with a large sample, the best is to use a boostrap t confidence interval
xbar <- mean(rating); xbar  #sample mean
## [1] 3.96
S <- sd(rating); S          #sample standard deviation
## [1] 0.3656
n <- length(rating);n
## [1] 999
SE <- S/(sqrt(n)) ; SE       #sample standard error
## [1] 0.01157
#Check our methodology with a single bootstrap resample
x <-sample(rating, size = n, replace = TRUE) #resample
Tstar<-(mean(x) - xbar)/(sd(x)/sqrt(n)); Tstar #a t statistic
## [1] -0.3896
#Now we will estimate the distribution of the t statistic

N = 10^4; Tstar = numeric(N) #vector of t statistics
means = numeric(N); StdErrs = numeric(N)
for (i in 1:N) {
  x <-sample(rating, size = n, replace = TRUE)
  Tstar[i] <-(mean(x) - xbar)/(sd(x)/sqrt(n))
  means[i] = mean(x); StdErrs[i] = sd(x)/sqrt(n)
}
#The bootstrap t statistic is approximately normal except for the tails
qqnorm(Tstar)
qqline(Tstar)

plot of chunk unnamed-chunk-1

# BONUS 6 - new plots
qplot(Tstar, geom = "blank") + 
  geom_histogram(aes(y = ..density..)
                 , alpha = 0.4,colour="black",fill="white") + 
  stat_function(fun=dt, args=n-1,
                colour="blue") +
  ggtitle("Boostrap Distribution of Ratings") + 
  xlab("Boostrap statistics") + 
  ylab("Density") +
  theme(plot.title = element_text(size=20, face="bold", vjust=2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-1

#A Student t curve is quite good a match except for the center
#To get a confidence interval, use the bootstrap quantiles along with the sample mean and standard deviation
q<-quantile(Tstar, c(.005, .995),names=FALSE)
L <- xbar - q[2]*SE; U <- xbar - q[1]*SE; L; U
## [1] 3.931
## [1] 3.989
#Here, for comparison, is the bootstrap percentile and the classical t confidence interval
quantile(means, c(.005, .995),names=FALSE)
## [1] 3.931 3.989
classic.t$conf.int
## [1] 3.93 3.99
## attr(,"conf.level")
## [1] 0.99
# we can display these 3 intervals
hist(rating, breaks=seq(2.75,5.25,0.5), xlim=c(2.5,5.5),freq=FALSE,
     main="Distribution of sample restaurant ratings in Boston") # the whole sample
abline(v=c(L,U),col='black',lty=3)
abline(v=quantile(means, c(.005, .995),names=FALSE),col='red',lty=4)
abline(v=classic.t$conf.int,col='blue',lty=5)

plot of chunk unnamed-chunk-1

# these 3 intervals are so close to each other that we may practically use any one
# but theoretically, we are only allowed to use the boostrap t statistics.
# so we can claim with 99% confidence that the true mean rating of Bostonian restaurants is 4!
# Good job, Boston! 
# Well, now we also have a sense of how inflated the ratings can be.  

# BONUS 14 - Chi-square distribution  
# As I recall from W7 section problem 2, we rescaled a discrete random variable X~binom
# and produced chi-square statistics out of it with the student t approach; 
# Knowing that our ratings are roughly normal but discrete, 
# are we able to theoretically replicate Chi-square distributions?

mu<-mean(rating);mu # unbiased estimator of the mean of the population
## [1] 3.96
sigma<-sqrt((n-1)/n*var(rating)); sigma # unbiased estimator of signma of the population
## [1] 0.3654
x<-(sample(rating, 10,replace=T)-mu)/sigma; x # we make it a standard normal variable
##  [1]  0.1096  0.1096 -1.2587  0.1096  0.1096 -1.2587  0.1096  0.1096
##  [9]  1.4779 -1.2587
# let's simulate 10000 times
# large sample is a data luxury: we can choose a small sample this time
df<-50
N<-10000; means <-numeric(N); vars<-numeric(N); sumsq <- numeric(N) 
for (i in 1:N) {
  x<-(sample(rating,df,replace=FALSE)-mu)/sigma
  means[i] <- mean(x) 
  vars[i]<- var(x)     
  sumsq[i]<-sum(x^2)
}

hist(means^2*df,freq=FALSE,main="Chi-square Distribution") # this quantity is called W in proof 1 of week 7
curve(dchisq(x,1),col='green',add=TRUE) # Big surpise! it fits well!

plot of chunk unnamed-chunk-1

hist(sumsq,freq=FALSE,main="Chi-square Distribution")  # this quantity is called U in proof 1 of week 7 
curve(dchisq(x,df),col='red',add=TRUE) # Wow, it fits nicely

plot of chunk unnamed-chunk-1

hist(vars*(df-1),freq=FALSE,main="Chi-square Distribution") # this quantity is called V in proof 1 of week 7 
curve(dchisq(x,df-1),col='violet',add=TRUE) # again, fits well

plot of chunk unnamed-chunk-1

cor(means^2,vars) # highly uncorrelated 
## [1] 0.01869
hist(means/sqrt(vars/df),freq=FALSE,main="Student t Distribution") # the quantity T=(X.bar-mu)/(S/sqrt(n))
# we painstakingly proved in class!
curve(dt(x,df-1),col="green",add=TRUE) # the fit is ok

plot of chunk unnamed-chunk-1

# This surprising connection with Chi-square distributions might qualify for BONUS 9


# 2) BONUS 16 and REQUIREMENT 1: correlation between reviews and ratings
# As we observe earlier, the normal curves shift towards the left as reviews increase.
# Do more reviews tend to generate lower ratings?
plot(review,rating,xlab="Number of Reviews",ylab='Ratings',main="Scatter plot of ratings against reviews")
cor(review,rating) # they seem to be negatively correlated
## [1] -0.1098
ReviewRating.lm <- lm(rating~num_reviews, data =Yelp);ReviewRating.lm
## 
## Call:
## lm(formula = rating ~ num_reviews, data = Yelp)
## 
## Coefficients:
## (Intercept)  num_reviews  
##    3.997697    -0.000183
intercept<-coef(ReviewRating.lm)[1]
slope<-coef(ReviewRating.lm)[2] 
abline(intercept,slope,col='blue') # downward slope
# Fit a non-linear line
lines(smooth.spline(rating~review), col = "red")
legend(1000,5, lty=c(1,1),c("linear regression","non-linear"), lwd=c(2.5,2.5),col=c("blue","red")) 

PredictRating <- intercept + slope * review
points(review,PredictRating, col = "green",add = TRUE, pch = 20)  #these lie on the regression line
## Warning: "add" is not a graphical parameter

plot of chunk unnamed-chunk-1

ResidRating <- rating - PredictRating
plot(review,ResidRating,main="Residual plot of ratings against reviews")    # this is clearly not random 
abline(h=0, col = "red")

plot of chunk unnamed-chunk-1

summary(ReviewRating.lm)$r.squared # BONUS 13 - R^2
## [1] 0.01206
# only 1% data is explained by our model;our model clearly does not work 

#Now do 5000 permutations to see whether the actual beta could arise by chance.
N <- 4999; n <- nrow(Yelp)
cor.perm <- numeric(N); beta.perm <- numeric(N)
for (i in 1:N) {
  index <- sample(n, replace = FALSE) 
  rating.perm <- rating[index]
  beta.perm[i] <-coef(lm(rating.perm~ Yelp$num_reviews))[2]  #beta
  cor.perm[i] <-cor(Yelp$num_reviews, rating.perm) #correlation
}
hist(beta.perm, xlim = c(-0.0005,0.0005))
abline(v = slope, col = "red")    

plot of chunk unnamed-chunk-1

# although off the chart, our beta is so close to zero that our linear model fails 

hist(cor.perm, xlim = c(-0.2,0.2))
abline(v = cor(review,rating), col = "red")    # off the charts

plot of chunk unnamed-chunk-1

# although significant, the correlation between ratings and reviews is quite weak!





# 3) Correlation between age and activitiy of reviews
# With census data about the population relating to the Yelp restaurant data we have,
# we wonder about the correlation between age and the number of reviews:
# Does the population of 20-to-40-year-old correlate with 
# the number of restaurants reviewed on Yelp?

# Create by zip code data frame of # restaurants, # of people
byZip <- data.frame(table(Yelp$zip));head(byZip)
##   Var1 Freq
## 1 2108   28
## 2 2109   21
## 3 2110   21
## 4 2111   46
## 5 2113   34
## 6 2114   21
add_column <- numeric(nrow(byZip))
byZip <- data.frame(byZip, add_column);head(byZip)
##   Var1 Freq add_column
## 1 2108   28          0
## 2 2109   21          0
## 3 2110   21          0
## 4 2111   46          0
## 5 2113   34          0
## 6 2114   21          0
colnames(byZip) <- c("zip", "num_rest", "age")
for(i in 1:length(byZip$zip))
{
  # each zip code has the same population, whichever row that zip code appears in
  #  so just take the value in the first record returned by this which
  byZip$age[i] = Yelp$perct_20.40[min(which(Yelp$zip == byZip$zip[i]))]
}

# make a scatter plot
plot(byZip$age, byZip$num_rest,col=rgb(1/2,0,1/2,1),xlab="% population of 20-to-40-year-old by zipcode",
     ylab="Number of restaurants by zipcode",main="Correlation between 20s-to-40s and # of restaurants") 
# looks roughly like positive correlation

# check correlation of these two independent variables - BONUS POINT #16
our_corr <- cor(byZip$age, byZip$num_rest); our_corr
## [1] 0.5564
# .56 indicates positive correlation

# find a regression line
rest_by_age <- lm(byZip$num_rest~byZip$age, data=byZip); rest_by_age
## 
## Call:
## lm(formula = byZip$num_rest ~ byZip$age, data = byZip)
## 
## Coefficients:
## (Intercept)    byZip$age  
##      -10.96         0.78
intercept<-coef(rest_by_age)[1]
slope<-coef(rest_by_age)[2] 
# this tells us that, for every additional percentage of 20-to-40-year-old living within a zipcode,
#   we will have about roughly 1 more restaurant for that zipcode.
abline(rest_by_age, col="blue")
legend(20,57, lty=c(1,1),c("linear regression line","non-linear line"), lwd=c(2.5,2.5),col=c("blue","red")) 

# let's just check that something nonlinear isn't going on:
lines(smooth.spline(byZip$num_rest~byZip$age), col="blue")

plot of chunk unnamed-chunk-1

# wow, it overlapps nicely with our linear model!
summary(rest_by_age)$r.squared # R^2 is 30%
## [1] 0.3096
# 30% of the data is explained by our linear model 

# To use the t distribution, let's first check if we have independent, normally distributed residuals
predictions <- intercept + slope * byZip$age
residuals <- byZip$num_rest - predictions
plot(byZip$age, residuals)
abline(h=0, col="red")

plot of chunk unnamed-chunk-1

# it looks pretty randomly distributed except a few outliers in the 40-50% range

# we can also bootstrap to create confidence interval.  (for BONUS 8)
N <- 10^4
cor.boot <- numeric(N)
slope.boot <- numeric(N)
n <- nrow(byZip)

get_intercept <- function(X, Y)
{
  mean(Y) - slope*mean(X)
}

get_slope <- function(X, Y)
{
  sum(( X - mean(X)) * ( Y - mean(Y)) / sum((X - mean(X))^2))
}

for(i in 1:N)
{
  index <- sample(1:n, n, replace = TRUE)
  data.boot <- byZip[index, ]
  cor.boot[i] <- cor(data.boot$num_rest, data.boot$age)
  slope.boot[i] <- get_slope(data.boot$age, data.boot$num_rest)
}


 
# REQUIREMENT 3 
# Display a 95% Confidence Interval for the correlation
# between the population % and number of restaurants
hist(cor.boot,freq=FALSE,main="Boostrap distribution of correlation coefficients")
abline(v=quantile(cor.boot, c(.025, .975)), col="blue")
abline(v=our_corr, col="red")

plot of chunk unnamed-chunk-1

quantile(cor.boot, c(.025, .975))
##   2.5%  97.5% 
## 0.3854 0.7110
# Find a 95% Confidence Interval for the slope of our model
hist(slope.boot,main="Boostrap distribution of slope of our linear model")
abline(v=quantile(slope.boot, c(.025, .975)), col="blue")
abline(v=slope, col="red")

plot of chunk unnamed-chunk-1

quantile(slope.boot, c(.025, .975))
##   2.5%  97.5% 
## 0.5119 1.0801
# 4) correlation between Asian population and # Chinese restaurants 
# I love Chinese cuisine but I struggled to locate a Chinese restaurant nearby,
# so a question arises naturally: Am I living not close enough to where my food lives?
# Statistically, this question translates into the following:
# Is there any correlation between Asian population and # Chinese restaurants 

asian<-Yelp$perct_Asian
category<-Yelp$category
index<-which(Yelp$category=="chinese")
chinese<-Yelp[index,] # select all the Chinese restaurants 
# Create by zip code data frame of # restaurants, % of Asian
byZip <- data.frame(table(chinese$zip));head(byZip)
##   Var1 Freq
## 1 2108    1
## 2 2110    1
## 3 2111   20
## 4 2118    1
## 5 2134    4
## 6 2135    2
add_column <- numeric(nrow(byZip))
byZip <- data.frame(byZip, add_column);head(byZip)
##   Var1 Freq add_column
## 1 2108    1          0
## 2 2110    1          0
## 3 2111   20          0
## 4 2118    1          0
## 5 2134    4          0
## 6 2135    2          0
colnames(byZip) <- c("zip", "num_rest", "asian")
for (i in 1:length(byZip$zip))
{
  # each zip code has the same Asian population, whichever row that zip code appears in
  #  so just take the value in the first record returned by this which
  byZip$asian[i] <- Yelp$perct_Asian[min(which(Yelp$zip == byZip$zip[i]))]  
}

# make a scatter plot
plot(byZip$num_rest,byZip$asian,main="Correlation: Asian population vs Chinese restaurants",
     ylab="Asian population % by zipcode",xlab="# of Chinese restuarants by zipcode") #
cor(byZip$num_rest,byZip$asian) # very strong postive correlaton!
## [1] 0.8461
# find a regression line
rest_by_asian <-lm(byZip$asian~byZip$num_rest); rest_by_asian
## 
## Call:
## lm(formula = byZip$asian ~ byZip$num_rest)
## 
## Coefficients:
##    (Intercept)  byZip$num_rest  
##           9.34            1.57
intercept<-coef(rest_by_asian)[1]
slope<-coef(rest_by_asian)[2]
# this tells us that, for every additional percentage of asian living in a zipcode,
#   we will have about roughly 1.5 more Chinese restaurant for that zipcode.
abline(rest_by_asian, col="blue")
legend(10,17, lty=c(1,1),c("linear regression line","non-linear line"), lwd=c(2.5,2.5),col=c("blue","red")) 
# let's just check that something nonlinear isn't going on:
lines(smooth.spline(byZip$asian~byZip$num_rest), col="red")

plot of chunk unnamed-chunk-1

# Wow, it overlapps perfectly with our linear model!
summary(rest_by_asian) # R^2 is 72%! 72% of the data is explained by our model
## 
## Call:
## lm(formula = byZip$asian ~ byZip$num_rest)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.235 -2.675 -0.324  1.900  9.806 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       9.337      1.500    6.23  3.1e-05 ***
## byZip$num_rest    1.569      0.274    5.72  7.0e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.95 on 13 degrees of freedom
## Multiple R-squared:  0.716,  Adjusted R-squared:  0.694 
## F-statistic: 32.8 on 1 and 13 DF,  p-value: 7.02e-05
# this finding fits with our intuition, and clearly I have not been living in Asian community

# 5)
# Logistic regression - BONUS 4
# Local businesses can choose to "claim" their business on Yelp, maintaining their Yelp
# profiles and hoping to increase their popularity. But we challenge this assumption:
# Does claiming your business on Yelp correlate with publicity?

Yelp$claimed<-as.logical(toupper(Yelp$claimed))
claimed<-Yelp$claimed
fit <- glm(claimed ~ num_reviews, data = Yelp, family = binomial)
coef(fit)    # the coefficients as a vector
## (Intercept) num_reviews 
##    0.713716    0.001052
# Calculation: my logistic equation is ln(p/1-p)=0.713716164+0.001051541*x

# we can plot the probability of whether a business is claimed against number of reviews
x <- seq(7, 2132, length=500) # vector spanning the #review range
# compute predicted probabilities
y1 <- exp(coef(fit)[1]+coef(fit)[2]*x) / (1+exp(coef(fit)[1]+coef(fit)[2]*x)) 
y2 <- plogis(coef(fit)[1] + coef(fit)[2] * x)
plot(Yelp$num_reviews,Yelp$claimed, xlab='number of reviews',main="Correlation: claiming your business vs publicity",
     ylab = "Probability of claimed your business")
lines(x, y2,col='blue') # our logistic regression curve
legend(1000,0.5, lty=1,"Built-in glm function", lwd=2.5,col="blue") 

# BONUS 15 - Maximum likelihood estimation 
# MLE also gives us the parameters
library(stats4)
x<-Yelp$num_reviews;y<-Yelp$claimed
MLL<- function(alpha, beta) -sum( log( exp(alpha+beta*x)/(1+exp(alpha+beta*x)) )*y+ 
                                    log(1/(1+exp(alpha+beta*x)))*(1-y) )
results<-mle(MLL,start = list(alpha = -0.1, beta = -0.02))
results@coef
##    alpha     beta 
## 0.693355 0.001159
curve( exp(results@coef[1]+results@coef[2]*x)/ (1+exp(results@coef[1]+results@coef[2]*x)),col = "red", add=TRUE)
legend(1000,0.5, lty=c(1,1),c("Built-in glm function","MLE estimation"), lwd=c(2.5,2.5),col=c("blue","red")) 

plot of chunk unnamed-chunk-1

# almost overlays the curve as we got from the glm function 


# BONUS 8 + Technical Display 3: We can make boostrap confidence intervals too
N <- 10^3
n <- length(claimed)                   # number of observations
alpha.boot <- numeric(N)
beta.boot <- numeric(N)
for (i in 1:N)
{
  index <- sample(n, replace = TRUE)
  claimed.boot <- Yelp[index, ]     # resampled data
  fit.boot <- glm(claimed ~ num_reviews, data = claimed.boot,
                  family = binomial)
  alpha.boot[i] <- coef(fit.boot)[1]    # new intercept
  beta.boot[i] <- coef(fit.boot)[2]     # new slope
}
quantile(beta.boot, c(.025, .975))      # 95% percentile intervals for the slope
##     2.5%    97.5% 
## 0.000187 0.002259
#Now we can look at the distribution of the resampled results
hist(beta.boot,,main="Boostrap distribution of beta",freq=FALSE)  
abline(v=quantile(beta.boot, c(.025, .975)),col="green") #95% confidence interval for beta
abline(v=coef(fit)[2],col="blue") # the observation

plot of chunk unnamed-chunk-1

quantile( beta.boot, c(.025, .975)) # the bootstrap confidence interval for beta 
##     2.5%    97.5% 
## 0.000187 0.002259
hist(alpha.boot,freq=FALSE,main="Boostrap distribution of alpha")  
abline(v=quantile( alpha.boot, c(.025, .975)),col="red") #95% confidence interval for alpha
abline(v=coef(fit)[1],col="blue") # the observation

plot of chunk unnamed-chunk-1

quantile( alpha.boot, c(.025, .975)) # the bootstrap confidence interval for alpha
##   2.5%  97.5% 
## 0.4799 0.9241